#title: "Project - Part II"
#student: Kaimi Huang (kh2908)
#objective of research: the objective of the research is to identify factors that relate to high systolic and diastolic blood pressures
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ stringr 1.4.0
## ✓ tidyr 1.2.0 ✓ forcats 0.5.1
## ✓ readr 2.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(ggplot2)
#comparison of (two) means t-test
#the first t-test compares the genders' blood pressure averages to see whether there is a significant differences
#load the gender data frame
gender <- read.csv("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project/gender.csv")
dim(gender)
## [1] 11656 9
head(gender)
## X Respondent.sequence.number. Systolic Diastolic Reading
## 1 1 109264 109 67 1st reading
## 2 2 109266 99 56 1st reading
## 3 3 109270 123 73 1st reading
## 4 4 109271 102 65 1st reading
## 5 5 109273 116 68 1st reading
## 6 6 109274 138 70 1st reading
## sys_blood_pressure_level dia_blood_pressure_level gender gender_description
## 1 normal normal 2 female
## 2 normal normal 2 female
## 3 elevated normal 2 female
## 4 normal normal 1 male
## 5 normal normal 1 male
## 6 high normal 1 male
names(gender)
## [1] "X" "Respondent.sequence.number."
## [3] "Systolic" "Diastolic"
## [5] "Reading" "sys_blood_pressure_level"
## [7] "dia_blood_pressure_level" "gender"
## [9] "gender_description"
#for assumption, check that the sample size is greater than 40 for both genders
table(gender$gender_description)
##
## female male
## 5932 5724
#calculate the means and standard deviations of systolic blood pressure by gender
sum_stat_sys_by_gender <- gender %>%
select(Systolic, gender_description) %>%
#group by gender_description
group_by(gender_description) %>%
#calculate the means and sds
summarize(mean_sys = mean(Systolic, na.rm=TRUE), sd_sys = sd(Systolic, na.rm=TRUE))
print(sum_stat_sys_by_gender)
## # A tibble: 2 × 3
## gender_description mean_sys sd_sys
## <chr> <dbl> <dbl>
## 1 female 118. 20.8
## 2 male 122. 18.8
#calculate the means and standard deviations of diastolic blood pressure by gender
sum_stat_dia_by_gender <- gender %>%
select(Diastolic, gender_description) %>%
#group by gender_description
group_by(gender_description) %>%
#calculate the means and sds
summarize(mean_dia = mean(Diastolic, na.rm=TRUE), sd_dia = sd(Diastolic, na.rm=TRUE))
print(sum_stat_dia_by_gender)
## # A tibble: 2 × 3
## gender_description mean_dia sd_dia
## <chr> <dbl> <dbl>
## 1 female 71.6 12.2
## 2 male 72.5 12.6
#create a comparative boxplot of median systolic blood pressure by gender
ggplot(gender, aes(Systolic, gender_description)) + geom_boxplot(colour = "blue", fill="light blue", outlier.colour="black", outlier.shape=16, outlier.size=2) + ggtitle("Systolic Blood Pressure by Gender")
## Warning: Removed 1304 rows containing non-finite values (stat_boxplot).

#create a comparative boxplot of median diastolic blood pressure by gender
ggplot(gender, aes(Diastolic, gender_description)) + geom_boxplot(colour = "blue", fill="light blue", outlier.colour="black", outlier.shape=16, outlier.size=2) + ggtitle("Diastolic Blood Pressure by Gender")
## Warning: Removed 1304 rows containing non-finite values (stat_boxplot).

#conduct a comparison of means test for systolic blood pressure
t.test(Systolic~gender_description, data=gender, equal.var=FALSE)
##
## Welch Two Sample t-test
##
## data: Systolic by gender_description
## t = -12.506, df = 10270, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
## -5.627164 -4.102184
## sample estimates:
## mean in group female mean in group male
## 117.5892 122.4539
#conduct a comparison of means test for diastolic blood pressure
t.test(Diastolic~gender_description, data=gender, equal.var=FALSE)
##
## Welch Two Sample t-test
##
## data: Diastolic by gender_description
## t = -3.6193, df = 10332, p-value = 0.0002968
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
## -1.3613409 -0.4048084
## sample estimates:
## mean in group female mean in group male
## 71.59728 72.48035
#comparison of (two) means t-test
#the second t-test compares the blood pressure averages of those who have health insurance and of those who do not to see whether there is a significant differences
#load the insurance data frame
health_insurance <- read.csv("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project/health_insurance.csv")
dim(health_insurance)
## [1] 11629 9
head(health_insurance)
## X Respondent.sequence.number. Systolic Diastolic Reading
## 1 1 109264 109 67 1st reading
## 2 2 109266 99 56 1st reading
## 3 3 109270 123 73 1st reading
## 4 4 109271 102 65 1st reading
## 5 5 109273 116 68 1st reading
## 6 6 109274 138 70 1st reading
## sys_blood_pressure_level dia_blood_pressure_level health.insurance
## 1 normal normal 1
## 2 normal normal 1
## 3 elevated normal 1
## 4 normal normal 1
## 5 normal normal 1
## 6 high normal 1
## coverd_by_health_insurance
## 1 yes
## 2 yes
## 3 yes
## 4 yes
## 5 yes
## 6 yes
names(health_insurance)
## [1] "X" "Respondent.sequence.number."
## [3] "Systolic" "Diastolic"
## [5] "Reading" "sys_blood_pressure_level"
## [7] "dia_blood_pressure_level" "health.insurance"
## [9] "coverd_by_health_insurance"
#for assumption, check that the sample size is greater than 40 for both yes and no groups (answer to the question of whether one is covered by insurance)
table(health_insurance$coverd_by_health_insurance)
##
## no yes
## 1598 10031
#calculate the means and standard deviations of systolic blood pressure by coverd_by_health_insurance (yes/no)
sum_stat_sys_by_insurance <- health_insurance %>%
select(Systolic, coverd_by_health_insurance) %>%
#group by coverd_by_health_insurance
group_by(coverd_by_health_insurance) %>%
#calculate the means and sds
summarize(mean_sys = mean(Systolic, na.rm=TRUE), sd_sys = sd(Systolic, na.rm=TRUE))
print(sum_stat_sys_by_insurance)
## # A tibble: 2 × 3
## coverd_by_health_insurance mean_sys sd_sys
## <chr> <dbl> <dbl>
## 1 no 121. 19.7
## 2 yes 120. 20.0
#calculate the means and standard deviations of diastolic blood pressure by coverd_by_health_insurance (yes/no)
sum_stat_dia_by_insurance <- health_insurance %>%
select(Diastolic, coverd_by_health_insurance) %>%
#group by coverd_by_health_insurance
group_by(coverd_by_health_insurance) %>%
#calculate the means and sds
summarize(mean_dia = mean(Diastolic, na.rm=TRUE), sd_dia = sd(Diastolic, na.rm=TRUE))
print(sum_stat_dia_by_insurance)
## # A tibble: 2 × 3
## coverd_by_health_insurance mean_dia sd_dia
## <chr> <dbl> <dbl>
## 1 no 74.5 13.2
## 2 yes 71.7 12.2
#create a comparative boxplot of median systolic blood pressure by coverd_by_health_insurance
ggplot(health_insurance, aes(Systolic, coverd_by_health_insurance)) + geom_boxplot(colour = "blue", fill="light blue", outlier.colour="black", outlier.shape=16, outlier.size=2) + ggtitle("Systolic Blood Pressure by Insurance")
## Warning: Removed 1302 rows containing non-finite values (stat_boxplot).

#create a comparative boxplot of median diastolic blood pressure by coverd_by_health_insurance
ggplot(health_insurance, aes(Diastolic, coverd_by_health_insurance)) + geom_boxplot(colour = "blue", fill="light blue", outlier.colour="black", outlier.shape=16, outlier.size=2) + ggtitle("Diastolic Blood Pressure by Insurance")
## Warning: Removed 1302 rows containing non-finite values (stat_boxplot).

#conduct a comparison of means test for systolic blood pressure
t.test(Systolic~coverd_by_health_insurance, data=health_insurance, equal.var=FALSE)
##
## Welch Two Sample t-test
##
## data: Systolic by coverd_by_health_insurance
## t = 2.1272, df = 1842.2, p-value = 0.03353
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## 0.094783 2.334828
## sample estimates:
## mean in group no mean in group yes
## 121.0595 119.8447
#conduct a comparison of means test for diastolic blood pressure
t.test(Diastolic~coverd_by_health_insurance, data=health_insurance, equal.var=FALSE)
##
## Welch Two Sample t-test
##
## data: Diastolic by coverd_by_health_insurance
## t = 7.5038, df = 1760.3, p-value = 9.789e-14
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## 2.101401 3.588650
## sample estimates:
## mean in group no mean in group yes
## 74.49746 71.65244
#chi-square test
#the first chi-square test compares blood pressure level proportions by race to see if there is an association between high blood pressure and race
#load the race data frame
race <- read.csv("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project/race.csv")
dim(race)
## [1] 11656 9
head(race)
## X Respondent.sequence.number. Systolic Diastolic Reading
## 1 1 109264 109 67 1st reading
## 2 2 109266 99 56 1st reading
## 3 3 109270 123 73 1st reading
## 4 4 109271 102 65 1st reading
## 5 5 109273 116 68 1st reading
## 6 6 109274 138 70 1st reading
## sys_blood_pressure_level dia_blood_pressure_level race race_description
## 1 normal normal 1 Mexican American
## 2 normal normal 6 Asian
## 3 elevated normal 4 Black
## 4 normal normal 3 White
## 5 normal normal 3 White
## 6 high normal 7 Other Race
names(race)
## [1] "X" "Respondent.sequence.number."
## [3] "Systolic" "Diastolic"
## [5] "Reading" "sys_blood_pressure_level"
## [7] "dia_blood_pressure_level" "race"
## [9] "race_description"
#get rid of the NA values
race <- na.omit(race)
#create tables that are by race and by blood pressure levels
table_sys <- table(race$sys_blood_pressure_level, race$race_description)
print(table_sys)
##
## Asian Black Mexican American Other Hispanic Other Race White
## elevated 212 449 222 168 94 620
## high 267 910 238 269 120 929
## normal 640 1408 812 575 414 2005
table_dia <- table(race$dia_blood_pressure_level, race$race_description)
print(table_dia)
##
## Asian Black Mexican American Other Hispanic Other Race White
## high 276 891 247 242 145 848
## normal 843 1876 1025 770 483 2706
#create bar charts to compare proportions
ggplot(race) + geom_bar(mapping = aes(x=race_description, fill = sys_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="race")

ggplot(race) + geom_bar(mapping = aes(x=race_description, fill = dia_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="race")

#conduct chi-square tests of independence fo systolic blood pressure levels
chisq.test(race$sys_blood_pressure_level, race$race_description)
##
## Pearson's Chi-squared test
##
## data: race$sys_blood_pressure_level and race$race_description
## X-squared = 131.21, df = 10, p-value < 2.2e-16
#store the test to create a table of the expected counts
Xsq_sys <- chisq.test(race$sys_blood_pressure_level, race$race_description)
Xsq_sys$expected
## race$race_description
## race$sys_blood_pressure_level Asian Black Mexican American
## elevated 190.7878 471.7692 216.8740
## high 295.4238 730.5072 335.8168
## normal 632.7884 1564.7235 719.3091
## race$race_description
## race$sys_blood_pressure_level Other Hispanic Other Race White
## elevated 172.5444 107.0730 605.9515
## high 267.1750 165.7964 938.2807
## normal 572.2805 355.1306 2009.7678
#conduct chi-square tests of independence fo diastolic blood pressure levels
chisq.test(race$dia_blood_pressure_level, race$race_description)
##
## Pearson's Chi-squared test
##
## data: race$dia_blood_pressure_level and race$race_description
## X-squared = 98.599, df = 5, p-value < 2.2e-16
#store the test to create a table of the expected counts
Xsq_dia <-chisq.test(race$dia_blood_pressure_level, race$race_description)
Xsq_dia$expected
## race$race_description
## race$dia_blood_pressure_level Asian Black Mexican American
## high 286.3438 708.0548 325.4954
## normal 832.6562 2058.9452 946.5046
## race$race_description
## race$dia_blood_pressure_level Other Hispanic Other Race White
## high 258.9633 160.7005 909.4422
## normal 753.0367 467.2995 2644.5578
#chi-square test
#the second chi-square test compares blood pressure level proportions by education level to see if there is an association between high blood pressures and education
#load the education data frame
education <- read.csv("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project/education.csv")
dim(education)
## [1] 11656 9
head(education)
## X Respondent.sequence.number. Systolic Diastolic Reading
## 1 1 109264 109 67 1st reading
## 2 2 109266 99 56 1st reading
## 3 3 109270 123 73 1st reading
## 4 4 109271 102 65 1st reading
## 5 5 109273 116 68 1st reading
## 6 6 109274 138 70 1st reading
## sys_blood_pressure_level dia_blood_pressure_level education
## 1 normal normal NA
## 2 normal normal 5
## 3 elevated normal NA
## 4 normal normal 2
## 5 normal normal 4
## 6 high normal 4
## education_level
## 1 <NA>
## 2 College graduate or above
## 3 <NA>
## 4 9-11th grade
## 5 Some college
## 6 Some college
names(education)
## [1] "X" "Respondent.sequence.number."
## [3] "Systolic" "Diastolic"
## [5] "Reading" "sys_blood_pressure_level"
## [7] "dia_blood_pressure_level" "education"
## [9] "education_level"
#get rid of the NA values
education <- na.omit(education)
dim(education)
## [1] 7648 9
#remove the "Don't Know" and "Refused" columns as there are too few counts in those columns and keeping them may affect the test
condition_1 <- education$education_level != "Don't Know"
table(condition_1)
## condition_1
## FALSE TRUE
## 4 7644
education <- education[condition_1,]
condition_2 <- education$education_level != "Refused"
table(condition_2)
## condition_2
## FALSE TRUE
## 1 7643
education <- education[condition_2,]
dim(education)
## [1] 7643 9
#create tables that are by education level and by blood pressure levels
table_sys <- table(education$sys_blood_pressure_level, education$education_level)
print(table_sys)
##
## 9-11th grade College graduate or above High school graduate
## elevated 152 423 366
## high 343 548 702
## normal 338 930 790
##
## Less than 9th grade Some college
## elevated 92 519
## high 245 841
## normal 181 1173
table_dia <- table(education$dia_blood_pressure_level, education$education_level)
print(table_dia)
##
## 9-11th grade College graduate or above High school graduate
## high 279 564 641
## normal 554 1337 1217
##
## Less than 9th grade Some college
## high 168 892
## normal 350 1641
#create bar charts to compare proportions
ggplot(education) + geom_bar(mapping = aes(x=education_level, fill = sys_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="education")

ggplot(education) + geom_bar(mapping = aes(x=education_level, fill = dia_blood_pressure_level), width=0.95, alpha=0.80) + labs(x="education")

#conduct chi-square tests of independence for systolic blood pressure levels
chisq.test(education$sys_blood_pressure_level, education$education_level)
##
## Pearson's Chi-squared test
##
## data: education$sys_blood_pressure_level and education$education_level
## X-squared = 91.084, df = 8, p-value = 2.802e-16
#store the test to create a table of the expected counts
Xsq_sys <- chisq.test(education$sys_blood_pressure_level, education$education_level)
Xsq_sys$expected
## education$education_level
## 9-11th grade College graduate or above High school graduate
## elevated 169.1503 386.0201 377.2885
## high 291.9805 666.3325 651.2602
## normal 371.8692 848.6474 829.4513
## education$education_level
## Less than 9th grade Some college
## elevated 105.1859 514.3551
## high 181.5677 887.8591
## normal 231.2464 1130.7858
#conduct chi-square tests of independence for diastolic blood pressure levels
chisq.test(education$dia_blood_pressure_level, education$education_level)
##
## Pearson's Chi-squared test
##
## data: education$dia_blood_pressure_level and education$education_level
## X-squared = 16.865, df = 4, p-value = 0.002053
#store the test to create a table of the expected counts
Xsq_dia <-chisq.test(education$dia_blood_pressure_level, education$education_level)
Xsq_dia$expected
## education$education_level
## 9-11th grade College graduate or above High school graduate
## high 277.267 632.7547 618.442
## normal 555.733 1268.2453 1239.558
## education$education_level
## Less than 9th grade Some college
## high 172.4182 843.1181
## normal 345.5818 1689.8819
#linear regression analysis
#the first linear regression analysis checks whether there is a linear relationship between blood pressures and income
#load the income data frame
income <- read.csv("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project/income.csv")
dim(income)
## [1] 11656 8
head(income)
## X Respondent.sequence.number. Systolic Diastolic Reading
## 1 1 109264 109 67 1st reading
## 2 2 109266 99 56 1st reading
## 3 3 109270 123 73 1st reading
## 4 4 109271 102 65 1st reading
## 5 5 109273 116 68 1st reading
## 6 6 109274 138 70 1st reading
## sys_blood_pressure_level dia_blood_pressure_level poverty_level_index
## 1 normal normal 1.29
## 2 normal normal 5.00
## 3 elevated normal 1.69
## 4 normal normal 1.20
## 5 normal normal 0.53
## 6 high normal 1.20
names(income)
## [1] "X" "Respondent.sequence.number."
## [3] "Systolic" "Diastolic"
## [5] "Reading" "sys_blood_pressure_level"
## [7] "dia_blood_pressure_level" "poverty_level_index"
#get rid of the NA values
income <- na.omit(income)
#plot the variables to create scatter plots
ggplot(income, aes(poverty_level_index, Systolic)) + geom_point(color = "hotpink", na.rm = TRUE)

ggplot(income, aes(poverty_level_index, Diastolic)) + geom_point(color = "aquamarine4", na.rm = TRUE)

#calculate the correlations between systolic, diastolic blood pressures and income
print(cor(income$poverty_level_index,income$Systolic))
## [1] 0.03808412
print(cor(income$poverty_level_index,income$Diastolic))
## [1] 0.03680397
#create models
sys_inc_model <- lm(Systolic ~ poverty_level_index, data = income)
print(sys_inc_model)
##
## Call:
## lm(formula = Systolic ~ poverty_level_index, data = income)
##
## Coefficients:
## (Intercept) poverty_level_index
## 118.4145 0.4861
dia_inc_model <- lm(Diastolic ~ poverty_level_index, data = income)
print(dia_inc_model)
##
## Call:
## lm(formula = Diastolic ~ poverty_level_index, data = income)
##
## Coefficients:
## (Intercept) poverty_level_index
## 71.150 0.294
#calculate and plot the residuals
sys_residuals<-resid(sys_inc_model)
hist(sys_residuals)

dia_residuals<-resid(dia_inc_model)
hist(dia_residuals)

#Normal probability plots of the residuals
qqnorm(sys_residuals)

qqnorm(dia_residuals)

#Calculate standard residuals
sys_standard_residuals <- rstandard(sys_inc_model)
length(sys_standard_residuals)
## [1] 8345
dia_standard_residuals <- rstandard(dia_inc_model)
length(dia_standard_residuals)
## [1] 8345
dim(income)
## [1] 8345 8
#plot the standard residuals
hist(sys_standard_residuals)

hist(dia_standard_residuals)

#Create standardized residual plots
plot(fitted(sys_inc_model), sys_standard_residuals)

plot(fitted(dia_inc_model), dia_standard_residuals)

#Combine regression variables with standard residuals
sys_income_stdres <- cbind(income, sys_standard_residuals)
dim(sys_income_stdres)
## [1] 8345 9
dia_income_stdres <- cbind(income, dia_standard_residuals)
dim(dia_income_stdres)
## [1] 8345 9
#Remove absolute values of standard residuals that are greater than 2.
sys_income_analysis <- sys_income_stdres[abs(sys_income_stdres$sys_standard_residuals) < 2,]
dim(sys_income_analysis)
## [1] 7938 9
dia_income_analysis <- dia_income_stdres[abs(dia_income_stdres$dia_standard_residuals) < 2,]
dim(dia_income_analysis)
## [1] 7981 9
#histogram of the standard residuals after removing outliers
hist(sys_income_analysis$sys_standard_residuals)

hist(dia_income_analysis$dia_standard_residuals)

#fit the least squares (regression) line to the scatter plots that are without outliers
ggplot(sys_income_analysis, aes(poverty_level_index, Systolic)) + geom_point(color = "hotpink", na.rm = TRUE) + geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(dia_income_analysis, aes(poverty_level_index, Diastolic)) + geom_point(color = "aquamarine4", na.rm = TRUE) + geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

#perform linear regression analyses
summary(lm(Systolic ~ poverty_level_index, data = sys_income_analysis))
##
## Call:
## lm(formula = Systolic ~ poverty_level_index, data = sys_income_analysis)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.823 -11.863 -2.114 10.445 42.308
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 115.4656 0.3200 360.848 < 2e-16 ***
## poverty_level_index 0.7297 0.1153 6.331 2.56e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.02 on 7936 degrees of freedom
## Multiple R-squared: 0.005026, Adjusted R-squared: 0.004901
## F-statistic: 40.09 on 1 and 7936 DF, p-value: 2.561e-10
summary(lm(Diastolic ~ poverty_level_index, data = dia_income_analysis))
##
## Call:
## lm(formula = Diastolic ~ poverty_level_index, data = dia_income_analysis)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.9257 -7.9257 -0.5485 7.5868 25.8428
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.96940 0.21140 330.978 < 2e-16 ***
## poverty_level_index 0.39127 0.07652 5.113 3.24e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.6 on 7979 degrees of freedom
## Multiple R-squared: 0.003266, Adjusted R-squared: 0.003141
## F-statistic: 26.15 on 1 and 7979 DF, p-value: 3.236e-07
#the next linear regression analysis checks whether there is a linear relationship between blood pressures and sleep
#load the sleep data frame
sleep <- read.csv("/Users/kaimihuang/Documents/Study/STATUN2102_001_2022_1 - Applied Statistical Computing/Project/sleep.csv")
dim(sleep)
## [1] 11656 8
head(sleep)
## X Respondent.sequence.number. Systolic Diastolic Reading
## 1 1 109264 109 67 1st reading
## 2 2 109266 99 56 1st reading
## 3 3 109270 123 73 1st reading
## 4 4 109271 102 65 1st reading
## 5 5 109273 116 68 1st reading
## 6 6 109274 138 70 1st reading
## sys_blood_pressure_level dia_blood_pressure_level workday_sleep_hrs
## 1 normal normal NA
## 2 normal normal 7.5
## 3 elevated normal NA
## 4 normal normal 10.0
## 5 normal normal 6.5
## 6 high normal 9.5
names(sleep)
## [1] "X" "Respondent.sequence.number."
## [3] "Systolic" "Diastolic"
## [5] "Reading" "sys_blood_pressure_level"
## [7] "dia_blood_pressure_level" "workday_sleep_hrs"
#get rid of the NA values
sleep <- na.omit(sleep)
#plot the variables to create scatter plots
ggplot(sleep, aes(workday_sleep_hrs, Systolic)) + geom_point(color = "hotpink", na.rm = TRUE)

ggplot(sleep, aes(workday_sleep_hrs, Diastolic)) + geom_point(color = "aquamarine4", na.rm = TRUE)

#calculate the correlations between systolic, diastolic blood pressures and sleep
print(cor(sleep$workday_sleep_hrs,sleep$Systolic))
## [1] -0.02701154
print(cor(sleep$workday_sleep_hrs,sleep$Diastolic))
## [1] -0.06131704
#create models
sys_sleep_model <- lm(Systolic ~ workday_sleep_hrs, data = sleep)
print(sys_sleep_model)
##
## Call:
## lm(formula = Systolic ~ workday_sleep_hrs, data = sleep)
##
## Coefficients:
## (Intercept) workday_sleep_hrs
## 126.0649 -0.3178
dia_sleep_model <- lm(Diastolic ~ workday_sleep_hrs, data = sleep)
print(dia_sleep_model)
##
## Call:
## lm(formula = Diastolic ~ workday_sleep_hrs, data = sleep)
##
## Coefficients:
## (Intercept) workday_sleep_hrs
## 77.6682 -0.4367
#calculate and plot the residuals
sys_residuals<-resid(sys_sleep_model)
hist(sys_residuals)

dia_residuals<-resid(dia_sleep_model)
hist(dia_residuals)

#Normal probability plots of the residuals
qqnorm(sys_residuals)

qqnorm(dia_residuals)

#Calculate standard residuals
sys_standard_residuals <- rstandard(sys_sleep_model)
length(sys_standard_residuals)
## [1] 8391
dia_standard_residuals <- rstandard(dia_sleep_model)
length(dia_standard_residuals)
## [1] 8391
dim(sleep)
## [1] 8391 8
#plot the standard residuals
hist(sys_standard_residuals)

hist(dia_standard_residuals)

#Create standardized residual plots
plot(fitted(sys_sleep_model), sys_standard_residuals)

plot(fitted(dia_sleep_model), dia_standard_residuals)

#Combine regression variables with standard residuals
sys_sleep_stdres <- cbind(sleep, sys_standard_residuals)
dim(sys_sleep_stdres)
## [1] 8391 9
dia_sleep_stdres <- cbind(sleep, dia_standard_residuals)
dim(dia_sleep_stdres)
## [1] 8391 9
#Remove absolute values of standard residuals that are greater than 2.
sys_sleep_analysis <- sys_sleep_stdres[abs(sys_sleep_stdres$sys_standard_residuals) < 2,]
dim(sys_sleep_analysis)
## [1] 7998 9
dia_sleep_analysis <- dia_sleep_stdres[abs(dia_sleep_stdres$dia_standard_residuals) < 2,]
dim(dia_sleep_analysis)
## [1] 8019 9
#histogram of the standard residuals after removing outliers
hist(sys_sleep_analysis$sys_standard_residuals)

hist(dia_sleep_analysis$dia_standard_residuals)

#fit the least squares (regression) line to the scatter plots that are without outliers
ggplot(sys_sleep_analysis, aes(workday_sleep_hrs, Systolic)) + geom_point(color = "hotpink", na.rm = TRUE) + geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(dia_sleep_analysis, aes(workday_sleep_hrs, Diastolic)) + geom_point(color = "aquamarine4", na.rm = TRUE) + geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

#perform linear regression analyses
summary(lm(Systolic ~ workday_sleep_hrs, data = sys_sleep_analysis))
##
## Call:
## lm(formula = Systolic ~ workday_sleep_hrs, data = sys_sleep_analysis)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.793 -11.965 -1.879 10.465 41.584
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 124.0730 0.8408 147.561 < 2e-16 ***
## workday_sleep_hrs -0.3657 0.1081 -3.381 0.000725 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.06 on 7996 degrees of freedom
## Multiple R-squared: 0.001428, Adjusted R-squared: 0.001303
## F-statistic: 11.43 on 1 and 7996 DF, p-value: 0.0007251
summary(lm(Diastolic ~ workday_sleep_hrs, data = dia_sleep_analysis))
##
## Call:
## lm(formula = Diastolic ~ workday_sleep_hrs, data = dia_sleep_analysis)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.9057 -7.4218 -0.4218 7.0943 24.3847
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77.51872 0.52905 146.524 < 2e-16 ***
## workday_sleep_hrs -0.51615 0.06807 -7.583 3.76e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.08 on 8017 degrees of freedom
## Multiple R-squared: 0.007121, Adjusted R-squared: 0.006997
## F-statistic: 57.5 on 1 and 8017 DF, p-value: 3.76e-14